#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib
import urllib2
import random
import re
from bs4 import BeautifulSoup
import MySQLdb
import time
from DBUtils.PooledDB import PooledDB
import threading
import logging
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# Proxy IP currently in use, as "protocol:ip:port"; None means connect directly.
agent_ip=None

try:
    # Shared MySQL connection pool (3 connections) for the whole script.
    pool = PooledDB(MySQLdb, 3, host='localhost', user='root', passwd="password", port=3306, db='taobao', connect_timeout = 5, charset='utf8')
    conn = pool.connection()
    curs = conn.cursor()
except Exception as e:
    # NOTE(review): if pool creation fails, `conn`/`curs` are never assigned and
    # every later DB access raises NameError — the script limps on regardless.
    logging.error("创建MySQL连接池失败：" + str(e))


# Fetch web page content
def getContent( url, agent_ip = None ):
    """Fetch the raw body of *url*, optionally through a proxy.

    url      -- address to fetch.
    agent_ip -- proxy spec "protocol:ip:port" (e.g. "http:1.2.3.4:8080"),
                or None to connect directly.
    Returns the response body as a byte string, or False on any failure.
    """
    try:
        user_agent = getUserAgent()
        headers = {"User-Agent": user_agent}

        # Build a per-call opener instead of urllib2.install_opener():
        # installing globally would leak the proxy into every later call,
        # including ones made with agent_ip=None.
        if agent_ip is not None:
            agent_ip_ = agent_ip.split(":")
            proxies = {agent_ip_[0]: agent_ip_[0]+"://"+agent_ip_[1]+":"+agent_ip_[2]}
            opener = urllib2.build_opener(urllib2.ProxyHandler(proxies))
        else:
            opener = urllib2.build_opener()

        request = urllib2.Request(url, headers=headers)
        response = opener.open(request, timeout = 5)
        return response.read()
    except Exception as e:
        # `as e` syntax works on both Py2.6+ and Py3, matching the rest of the file.
        logging.error("获取网页内容失败："+str(e))
        return False

# Scrape proxy IPs
def getIp( agent_ip = None ):
    """Scrape xicidaili.com for proxies, store the usable ones, return one.

    agent_ip -- optional proxy "protocol:ip:port" used for the scrape itself;
                it is verified first and dropped when dead.
    Returns a verified "protocol:ip:port" string, or None when nothing usable
    was found or the page could not be fetched/parsed.
    """
    url = "http://www.xicidaili.com/nn/"

    if agent_ip is not None and not checkAgentIp( agent_ip ):
        # Log BEFORE clearing agent_ip — the original cleared it first, so the
        # warning always printed "None" instead of the dead proxy.
        logging.warning("代理IP："+str(agent_ip)+"不可用，不使用代理方式获取网页内容")
        agent_ip = None

    # Fetch the listing page (possibly through the verified proxy).
    content = getContent( url, agent_ip )
    try:
        if content:
            soup = BeautifulSoup(content, "lxml")
            ip_list = soup.select("#ip_list tr")

            rows = []           # (ip, port, addr, protocol) tuples that passed the check
            last_usable = None  # last proxy that actually answered
            # Only the first 25 table rows are inspected, as before.
            for item in ip_list[:25]:
                # Rows without the availability bar are headers/malformed — skip.
                if not item.select(".country div.bar"):
                    continue
                cells = item.select("td")
                ip = cells[1].get_text()
                port = cells[2].get_text()
                protocol = cells[5].get_text().lower()
                # Keep only proxies that pass a live check.
                if checkAgentIp( protocol+":"+ip+":"+port ):
                    rows.append((ip, port, '--', protocol))
                    last_usable = protocol+":"+ip+":"+port

            logging.info("本次爬取可用代理IP共计："+ str(len(rows)) + "条")
            if rows:
                # Parameterized insert — scraped page text never reaches raw SQL.
                curs.executemany(
                    "insert into agent_ip (ip, port, addr, protocol) values (%s, %s, %s, %s)",
                    rows)
                conn.commit()
                # Return the last proxy that VERIFIED ok; the original returned the
                # last row parsed, which could be one that failed its check.
                return last_usable
            else:
                return None

    except Exception as e:
        logging.error("解析网页内容错误："+str(e))
        return None

# Check whether a proxy IP works
def checkAgentIp( agent_ip = None ):
    """Return True when the proxy "protocol:ip:port" can fetch baidu.com.

    Any failure (bad spec, timeout, connection error) returns False.
    """
    try:
        user_agent = getUserAgent()
        headers = {"User-Agent": user_agent}

        url = "http://www.baidu.com"
        agent_ip_ = agent_ip.split(":")
        proxies = {agent_ip_[0]: agent_ip_[0]+"://"+agent_ip_[1]+":"+agent_ip_[2]}
        # Per-call opener — avoids install_opener() permanently redirecting
        # every later urllib2 request through this candidate proxy.
        opener = urllib2.build_opener(urllib2.ProxyHandler(proxies))
        request = urllib2.Request(url, headers=headers)
        response = opener.open(request, timeout = 3)
        content = response.read()

    except Exception as e:
        logging.info("代理IP：%s不可用" % agent_ip)
        return False

    # Escaped dot: the original r'baidu.com' also matched e.g. "baiduXcom"
    # on a proxy-injected interstitial page.
    regex = re.compile(r'baidu\.com')
    if regex.search(content):
        logging.info("代理IP：%s可用" % agent_ip)
        return True
    else:
        logging.info("代理IP：%s不可用" % agent_ip)
        return False

# Purge dead proxy IPs from the pool
def checkAgentIpUsable():
    """Background loop: every 60s re-check each stored proxy, delete dead ones.

    Runs forever; intended as a threading.Thread target. Errors are logged and
    the loop continues on the next cycle.
    """
    logging.info("开始创建清理不可用的代理IP的线程...")
    while True:
        try:
            sql = "select protocol,ip, port from agent_ip;"
            count = curs.execute(sql)
            removed = 0
            if count > 0:
                lists = curs.fetchall()
                for item in lists:
                    agent_ip = ":".join(item)
                    if not checkAgentIp( agent_ip ):
                        # Parameterized delete; values originate from scraped pages.
                        curs.execute(
                            "delete from agent_ip where protocol = %s and ip = %s and port = %s;",
                            (item[0], item[1], item[2]))
                        conn.commit()
                        removed = removed + 1
            # The original logged `count` (TOTAL rows) here, not how many were
            # actually purged — report the real deletion count.
            logging.info("清理代理IP池中不可用的IP共计"+str( removed )+"条")

        except Exception as e:
            logging.error("清理代理IP池时出现错误：" + str(e))

        time.sleep(60)

# Module-level constant: the original rebuilt this 46-entry list on every call.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Beamrise/17.2.0.9 Chrome/17.0.939.0 Safari/535.8",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/18.6.872.0 Safari/535.2 UNTRUSTED/1.0 3gpp-gba UNTRUSTED/1.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/12.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; Sleipnir/2.9.8)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.65 Safari/534.24",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7ad-imcjapan-syosyaman-xkgi3lqg03!wgz",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7xs5D9rRDFpg2g",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.113 Safari/534.30",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.12 Safari/534.24",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.669.0 Safari/534.20",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.655.0 Safari/534.17",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.19 Safari/534.13",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10"
]

def getUserAgent():
    """Return a random desktop-browser User-Agent string for request headers."""
    return random.choice(USER_AGENT_LIST)

def createLogFile():
    """Ensure ./logs exists and (re)create an empty ./logs/agentip.log."""
    log_path = "./logs"
    log_file = log_path + "/agentip.log"

    if not os.path.exists(log_path):
        os.makedirs(log_path)

    if os.path.isfile(log_file):
        os.remove(log_file)

    # open()+close() instead of os.mknod(): mknod requires privileges on some
    # platforms (e.g. macOS) and does not exist on Windows.
    open(log_file, 'w').close()

if __name__ == '__main__':

    # Prepare a fresh log file, then route all logging into it.
    createLogFile()
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', datefmt='%a, %d %b %Y %H:%M:%S', filename='./logs/agentip.log', filemode='w')

    logging.info("开始爬取代理IP....")
    # Background thread that periodically purges dead proxies from the table.
    # NOTE(review): not a daemon thread, so it keeps the process alive on exit.
    thread = threading.Thread(target=checkAgentIpUsable)
    thread.start()
    #thread.join()

    # Main loop: scrape forever, feeding the last usable proxy back into the
    # next round so subsequent scrapes go through it.
    i = 0
    while True:
        i = i + 1
        logging.info("第"+ str(i) + "次爬取代理IP")
        agent_ip = getIp( agent_ip )
        #time.sleep(random.randint(1, 10))